# -*- coding: utf-8 -*-
"""
Created on Fri Jun 10 18:17:19 2022
LDA Lemmas file modified to use only the Air Subset, with 10 topics and seed 2
@author: samir
"""



import pandas as pd
import topicmodels
import topicmodels.preprocess
import os
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS 
from collections import Counter #used for wordcloud
import random #grayscale
from paths import target_path, air_LDA_path #importing target_path from paths.py


def main():
    """Print the banner naming this script's model configuration."""
    banner = "Running: Air LDA Lemmas 10 Topics Seed 2"
    print(banner)


# NOTE(review): only the banner is guarded here; the pipeline statements
# further down sit at module level and therefore run on import as well.
if __name__ == "__main__":
    main()

# Work from the project root so the relative data paths below resolve.
# A missing root is only reported; the script carries on from the current
# working directory.
if not os.path.exists(target_path):
    print(f"Warning: {target_path} does not exist!")
else:
    os.chdir(target_path)


# Load the lemmatized complaints and blank out missing values before the
# text column is handed to topicmodels.
dataframe = pd.read_csv(
    "Air_Subset/Processed_Data/Air_Complaints_lemmatized.csv",
    encoding="utf-8",
).replace(np.nan, '', regex=True)
docsobj2 = topicmodels.RawDocs(dataframe.IncidentDescriptionLemma, "long")

# Token counts straight after lemmatization, before any cleaning.
all_tokens = [token for doc in docsobj2.tokens for token in doc]
n_unique_tokens = len(set(all_tokens))
n_total_tokens = len(all_tokens)
print("number of unique tokens = %d" % n_unique_tokens)
print("number of total tokens = %d" % n_total_tokens)

# Clean the token lists, then strip stopwords.
# NOTE(review): token_clean(1) presumably drops tokens of length <= 1 -
# confirm against the topicmodels documentation.
docsobj2.token_clean(1)
docsobj2.stopword_remove("tokens")

###Change directory to save LDA Outputs in correct folder
os.chdir(air_LDA_path)
# NOTE(review): term_rank presumably fills docsobj2.tfidf_ranking and
# docsobj2.df_ranking (both read just below) and may also write ranking
# files into the current folder - confirm against topicmodels.
docsobj2.term_rank("tokens")

###Plot word frequency measures, decide where to cut off dataset
# Each curve gets its own figure. pyplot keeps drawing on the current figure
# until a new one is opened, so without this the second image would contain
# both curves. Figures are closed once saved so they do not stay open for
# the rest of the run.
plt.figure()
plt.plot([entry[1] for entry in docsobj2.tfidf_ranking])
plt.savefig('tf_idf_ranking.png')
plt.close()

plt.figure()
plt.plot([entry[1] for entry in docsobj2.df_ranking])
plt.savefig('df_ranking.png')
plt.close()

#Remove words below tf-idf cutoff
# The cutoff is the score of the entry at position 6000 in the tf-idf
# ranking; this raises IndexError if the ranking has 6000 or fewer entries.
tfidf_cutoff = docsobj2.tfidf_ranking[6000][1]
docsobj2.rank_remove("tfidf", "tokens", tfidf_cutoff)

#Wordcount after tf-idf removal
all_tokens = [token for doc in docsobj2.tokens for token in doc]
print("number of unique tokens = %d" % len(set(all_tokens)))
print("number of total tokens = %d" % len(all_tokens))

### Enter the output folder for this configuration. The path is relative, so
### it resolves against the current working directory at this point in the run.
os.chdir('./6000_tfidf_10_Topics_seed_2')
# Build the Gibbs-sampling LDA object over the trimmed token lists.
# NOTE(review): the second argument (10) is presumably the number of topics,
# matching ldaobj.K used further down - confirm against topicmodels.
ldaobj = topicmodels.LDA.LDAGibbs(docsobj2.tokens, 10)

# Set seed for replicability: the seed values are read as integers from
# seed.csv in the current folder and handed to the sampler.
seed = np.loadtxt('seed.csv', dtype = int)
ldaobj.set_seed(seed)

# Set NumPy's global RNG seed (per the original note: used for the
# reallocation of words to topics).
np.random.seed(2001)

### Estimate the model
# NOTE(review): the arguments are presumably (burn-in iterations, thinning
# interval, number of samples) - confirm against topicmodels.
ldaobj.sample(4000, 50, 80)
# Save the perplexity values returned by the sampler for later inspection.
perparray = ldaobj.perplexity()
np.savetxt('Perplexity.csv', perparray, delimiter=',')

# NOTE(review): presumably retains the last 80 samples - confirm.
ldaobj.samples_keep(80)

# NOTE(review): presumably writes the top 25 terms per topic to
# topic_description.csv, which the per-topic word-cloud section reads from
# this folder - confirm.
ldaobj.topic_content(25)

# dt is indexed below as dt[:, i], i.e. one column per topic; each column is
# stored as a new column of `dataframe`.
dt = ldaobj.dt_avg()
# tt is not used again in the visible part of this script.
tt = ldaobj.tt_avg()
ldaobj.dict_print()

# Append one column per topic (T0, T1, ...) to the complaints table.
for i in range(ldaobj.K):  # xrange switched to range in python 3
    dataframe['T' + str(i)] = dt[:, i]
dataframe.to_csv("lda_final_output.csv", index=False)

# Save as dta. CIN becomes the index first (in place, so `dataframe` itself
# is modified after the CSV has been written).
# NOTE(review): presumably so the Stata file is keyed by CIN - confirm.
dataframe.set_index('CIN', inplace = True)
dataframe.to_stata('lda_final_output.dta', version = 118)

######################
# Create word clouds
######################
#              Air Subset
# Python program to generate WordCloud 
# Code from: https://www.geeksforgeeks.org/generating-word-cloud-python/ 

# Back to the project root: the paths used from here on are relative to it.
os.chdir(target_path)

# wordcloud's built-in stopword list, passed to the WordCloud objects below.
stopwords = set(STOPWORDS)

# The ranking file has no header row; label its two columns explicitly.
ranking_csv = "Air_Subset/LDA_Outputs/df_ranking.csv"
df = pd.read_csv(ranking_csv, header=None)
df.columns = ["word", "frequency"]

#Get grayscale colors
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a random grey shade as an HSL colour string.

    Has the callback shape expected by ``WordCloud.recolor(color_func=...)``.
    The word and its layout do not influence the colour; only the lightness
    is drawn at random.

    Parameters
    ----------
    word, font_size, position, orientation
        Accepted for signature compatibility; unused.
    random_state : optional
        Source of randomness. An object exposing ``randint(a, b)`` (such as
        ``random.Random``) is used directly, so a seeded generator yields
        reproducible colours. An ``int`` is treated as a seed for a fresh
        ``random.Random``. ``None`` falls back to the module-level ``random``
        functions.
    **kwargs
        Any further keyword arguments are ignored.

    Returns
    -------
    str
        ``"hsl(0, 0%, L%)"`` with integer lightness ``L`` between 5 and 70
        inclusive.
    """
    if random_state is None:
        rng = random
    elif isinstance(random_state, int):
        rng = random.Random(random_state)
    else:
        rng = random_state
    return "hsl(0, 0%%, %d%%)" % rng.randint(5, 70)

# Build the word frequencies that drive the word cloud
# Map every ranked word to its frequency. Repeated words accumulate and
# non-positive counts are left out, which matches counting a list where each
# word appears `frequency` times - without building that list.
word_cloud_dict = Counter()
for word, frequency in zip(df.word, df.frequency):
    if frequency > 0:
        word_cloud_dict[word] += int(frequency)


# NOTE(review): per the wordcloud documentation, `stopwords` is ignored by
# generate_from_frequencies; it is kept so the constructor call is unchanged.
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    font_path='Air_Subset/wordclouds/Latin-Modern-Roman/lmroman10-regular.otf',
    min_font_size=10,
).generate_from_frequencies(word_cloud_dict)

# Recolour in greyscale. recolor() is called on the cloud object itself and
# that same object is written to disk below, so this must happen before
# to_file().
wordcloud.recolor(color_func=grey_color_func, random_state=3)

# On-screen preview only (useful in interactive sessions); the image on disk
# is written by to_file(), not by matplotlib.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)

wordcloud.to_file("Air_Subset/wordclouds/Air_subset.png")

#             BY TOPIC
# Read the headerless topic description file and transpose it so that each
# topic occupies two adjacent columns: its words, then their weights.
topic_desc = pd.read_csv(
    "Air_Subset/LDA_Outputs/6000_tfidf_10_Topics_seed_2/topic_description.csv",
    header=None,
)
topic_desc = topic_desc.transpose()

# Column labels T0, T0freq, T1, T1freq, ... ; assigning them fails loudly if
# the file does not hold exactly two columns per topic.
n_topics = 10
names = []
for x in range(n_topics):
    names += ["T" + str(x), "T" + str(x) + "freq"]
topic_desc.columns = names
# Drop the first row, which holds no word/weight data.
# NOTE(review): presumably the row labels written by topicmodels - confirm.
topic_desc = topic_desc.iloc[1:]


# Loop through topics, make wordcloud, and save to file
for x in range(n_topics):
    topic_words = topic_desc.iloc[:, 2 * x]
    # Scale the weights by 1000 and truncate to integers so they can act as
    # counts. Working on a derived Series avoids assigning into a slice of
    # topic_desc.
    topic_weights = (topic_desc.iloc[:, 2 * x + 1].astype(float) * 1000).astype(int)

    # Pair words and weights by position; words whose scaled weight
    # truncates to zero are left out of the cloud.
    word_cloud_dict = Counter()
    for word, weight in zip(topic_words, topic_weights):
        if weight > 0:
            word_cloud_dict[word] += int(weight)

    # NOTE(review): per the wordcloud documentation, `stopwords` is ignored
    # by generate_from_frequencies; kept so the constructor call is unchanged.
    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        stopwords=stopwords,
        font_path='Air_Subset/wordclouds/Latin-Modern-Roman/lmroman10-regular.otf',
        min_font_size=10,
    ).generate_from_frequencies(word_cloud_dict)

    # Render the greyscale cloud on a fresh figure and save it.
    fig = plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3),
               interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.savefig("Air_Subset/wordclouds/10_Topics_Seed_2/air_subset_topic_" + str(x) + ".png") #save to file
    # Close the figure so one does not stay open per topic.
    plt.close(fig)

  